We start by loading the previous “prep” file and check if the data is the same.

df <- read.csv("FPdata_PREP.csv", stringsAsFactors = TRUE)
str(df)
## 'data.frame':    426043 obs. of  40 variables:
##  $ State                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender                   : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ GeneralHealth            : int  4 5 4 5 2 1 4 3 3 3 ...
##  $ PhysicalHealthDays       : int  0 0 2 0 2 1 0 0 0 1 ...
##  $ MentalHealthDays         : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ LastCheckupTime          : int  5 1 5 5 5 5 5 5 5 5 ...
##  $ PhysicalActivities       : int  0 0 1 1 1 0 1 0 1 1 ...
##  $ SleepHours               : int  8 6 5 7 9 7 7 8 6 7 ...
##  $ RemovedTeeth             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HadHeartAttack           : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ HadAngina                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HadStroke                : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ HadAsthma                : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ HadSkinCancer            : int  0 1 1 0 0 0 0 0 1 0 ...
##  $ HadCOPD                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HadDepressiveDisorder    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HadKidneyDisease         : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ HadArthritis             : int  0 0 0 1 0 0 0 1 1 0 ...
##  $ HadDiabetes              : int  1 0 0 0 0 1 0 0 0 1 ...
##  $ DeafOrHardOfHearing      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BlindOrVisionDifficulty  : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ DifficultyConcentrating  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DifficultyWalking        : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ DifficultyDressingBathing: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DifficultyErrands        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SmokerStatus             : int  0 0 0 1 0 0 1 0 1 0 ...
##  $ ECigaretteUsage          : int  0 1 1 1 1 1 1 1 0 1 ...
##  $ ChestScan                : int  0 0 0 1 1 0 0 1 0 0 ...
##  $ RaceEthnicityCategory    : Factor w/ 6 levels "0 - NA's","Black only, Non-Hispanic",..: 6 6 6 6 6 6 2 6 6 6 ...
##  $ AgeCategory              : int  13 13 8 10 5 13 13 13 12 11 ...
##  $ HeightInMeters           : num  1.68 1.6 1.57 1.65 1.57 1.8 1.65 1.63 1.7 1.68 ...
##  $ WeightInKilograms        : num  90.7 68 63.5 63.5 54 ...
##  $ BMI                      : num  26.6 26.6 25.6 23.3 21.8 ...
##  $ AlcoholDrinkers          : int  0 0 0 0 1 0 1 0 0 1 ...
##  $ HIVTesting               : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ FluVaxLast12             : int  1 0 0 1 0 0 0 1 0 1 ...
##  $ PneumoVaxEver            : int  0 0 0 1 1 1 0 1 0 1 ...
##  $ TetanusLast10Tdap        : int  1 1 0 1 1 1 1 1 1 1 ...
##  $ HighRiskLastYear         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CovidPos                 : int  0 0 1 0 0 0 0 0 0 0 ...
summary(df)
##      State          Gender       GeneralHealth   PhysicalHealthDays
##  Min.   : 1.0   Min.   :0.0000   Min.   :1.000   Min.   : 0.000    
##  1st Qu.:16.0   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.: 0.000    
##  Median :27.0   Median :0.0000   Median :4.000   Median : 0.000    
##  Mean   :28.4   Mean   :0.4745   Mean   :3.466   Mean   : 4.076    
##  3rd Qu.:43.0   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.: 3.000    
##  Max.   :54.0   Max.   :1.0000   Max.   :5.000   Max.   :30.000    
##  MentalHealthDays LastCheckupTime PhysicalActivities   SleepHours    
##  Min.   : 0.000   Min.   :1.000   Min.   :0.0000     Min.   : 1.000  
##  1st Qu.: 0.000   1st Qu.:5.000   1st Qu.:1.0000     1st Qu.: 6.000  
##  Median : 0.000   Median :5.000   Median :1.0000     Median : 7.000  
##  Mean   : 4.158   Mean   :4.548   Mean   :0.7697     Mean   : 7.031  
##  3rd Qu.: 4.000   3rd Qu.:5.000   3rd Qu.:1.0000     3rd Qu.: 8.000  
##  Max.   :30.000   Max.   :5.000   Max.   :1.0000     Max.   :24.000  
##   RemovedTeeth    HadHeartAttack      HadAngina         HadStroke      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.4712   Mean   :0.05596   Mean   :0.05893   Mean   :0.04286  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##    HadAsthma     HadSkinCancer        HadCOPD        HadDepressiveDisorder
##  Min.   :0.000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000       
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000       
##  Median :0.000   Median :0.00000   Median :0.00000   Median :0.0000       
##  Mean   :0.144   Mean   :0.08114   Mean   :0.07712   Mean   :0.1981       
##  3rd Qu.:0.000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000       
##  Max.   :1.000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000       
##  HadKidneyDisease   HadArthritis    HadDiabetes     DeafOrHardOfHearing
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.00000    
##  1st Qu.:0.00000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.00000    
##  Median :0.00000   Median :0.000   Median :0.0000   Median :0.00000    
##  Mean   :0.04431   Mean   :0.333   Mean   :0.1389   Mean   :0.08754    
##  3rd Qu.:0.00000   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:0.00000    
##  Max.   :1.00000   Max.   :1.000   Max.   :1.0000   Max.   :1.00000    
##  BlindOrVisionDifficulty DifficultyConcentrating DifficultyWalking
##  Min.   :0.00000         Min.   :0.0000          Min.   :0.0000   
##  1st Qu.:0.00000         1st Qu.:0.0000          1st Qu.:0.0000   
##  Median :0.00000         Median :0.0000          Median :0.0000   
##  Mean   :0.05174         Mean   :0.1085          Mean   :0.1427   
##  3rd Qu.:0.00000         3rd Qu.:0.0000          3rd Qu.:0.0000   
##  Max.   :1.00000         Max.   :1.0000          Max.   :1.0000   
##  DifficultyDressingBathing DifficultyErrands  SmokerStatus    ECigaretteUsage 
##  Min.   :0.00000           Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000           1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.00000           Median :0.00000   Median :0.0000   Median :1.0000  
##  Mean   :0.03426           Mean   :0.06833   Mean   :0.3662   Mean   :0.7501  
##  3rd Qu.:0.00000           3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.00000           Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##    ChestScan                        RaceEthnicityCategory  AgeCategory    
##  Min.   :0.0000   0 - NA's                     : 13565    Min.   : 1.000  
##  1st Qu.:0.0000   Black only, Non-Hispanic     : 32934    1st Qu.: 5.000  
##  Median :0.0000   Hispanic                     : 40998    Median : 8.000  
##  Mean   :0.3684   Multiracial, Non-Hispanic    :  9024    Mean   : 7.705  
##  3rd Qu.:1.0000   Other race only, Non-Hispanic: 22004    3rd Qu.:11.000  
##  Max.   :1.0000   White only, Non-Hispanic     :307518    Max.   :13.000  
##  HeightInMeters  WeightInKilograms      BMI        AlcoholDrinkers 
##  Min.   :0.910   Min.   : 22.68    Min.   :14.51   Min.   :0.0000  
##  1st Qu.:1.630   1st Qu.: 68.04    1st Qu.:24.37   1st Qu.:0.0000  
##  Median :1.680   Median : 81.65    Median :26.63   Median :1.0000  
##  Mean   :1.702   Mean   : 81.74    Mean   :27.53   Mean   :0.5843  
##  3rd Qu.:1.780   3rd Qu.: 90.72    3rd Qu.:30.34   3rd Qu.:1.0000  
##  Max.   :2.410   Max.   :285.00    Max.   :40.91   Max.   :1.0000  
##    HIVTesting      FluVaxLast12    PneumoVaxEver    TetanusLast10Tdap
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000   
##  Median :0.0000   Median :1.0000   Median :0.0000   Median :1.0000   
##  Mean   :0.2866   Mean   :0.5779   Mean   :0.3413   Mean   :0.8125   
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   
##  HighRiskLastYear     CovidPos     
##  Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000  
##  Mean   :0.03821   Mean   :0.2757  
##  3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000
levels(df$RaceEthnicityCategory)
## [1] "0 - NA's"                      "Black only, Non-Hispanic"     
## [3] "Hispanic"                      "Multiracial, Non-Hispanic"    
## [5] "Other race only, Non-Hispanic" "White only, Non-Hispanic"

First of all we have to deal with the NA’s in RaceEthnicityCategory as we kept it to avoid multicollinearity on regression models.

print("Total row count:")
## [1] "Total row count:"
prev_rows <- nrow(df)
print(prev_rows)
## [1] 426043
print("RaceEthnicityCategory level count:")
## [1] "RaceEthnicityCategory level count:"
table(df$RaceEthnicityCategory)
## 
##                      0 - NA's      Black only, Non-Hispanic 
##                         13565                         32934 
##                      Hispanic     Multiracial, Non-Hispanic 
##                         40998                          9024 
## Other race only, Non-Hispanic      White only, Non-Hispanic 
##                         22004                        307518
print("RaceEthnicityCategory level 0 - NA's %:")
## [1] "RaceEthnicityCategory level 0 - NA's %:"
prev_nas <- table(df$RaceEthnicityCategory)["0 - NA's"]
prev_nas/ prev_rows * 100
## 0 - NA's 
## 3.183951

Usually we can replace them with the mode but as that level only represents 3% of the data and we will still be above 400.000 rows we will simply drop them.

df <- droplevels(df[df$RaceEthnicityCategory != "0 - NA's", ])
table(df$RaceEthnicityCategory)
## 
##      Black only, Non-Hispanic                      Hispanic 
##                         32934                         40998 
##     Multiracial, Non-Hispanic Other race only, Non-Hispanic 
##                          9024                         22004 
##      White only, Non-Hispanic 
##                        307518
print(paste("Predicted rows:",prev_rows-prev_nas))
## [1] "Predicted rows: 412478"
print(paste("Actual rows:   ",nrow(df)))
## [1] "Actual rows:    412478"
str(df)
## 'data.frame':    412478 obs. of  40 variables:
##  $ State                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender                   : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ GeneralHealth            : int  4 5 4 5 2 1 4 3 3 3 ...
##  $ PhysicalHealthDays       : int  0 0 2 0 2 1 0 0 0 1 ...
##  $ MentalHealthDays         : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ LastCheckupTime          : int  5 1 5 5 5 5 5 5 5 5 ...
##  $ PhysicalActivities       : int  0 0 1 1 1 0 1 0 1 1 ...
##  $ SleepHours               : int  8 6 5 7 9 7 7 8 6 7 ...
##  $ RemovedTeeth             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HadHeartAttack           : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ HadAngina                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HadStroke                : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ HadAsthma                : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ HadSkinCancer            : int  0 1 1 0 0 0 0 0 1 0 ...
##  $ HadCOPD                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HadDepressiveDisorder    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HadKidneyDisease         : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ HadArthritis             : int  0 0 0 1 0 0 0 1 1 0 ...
##  $ HadDiabetes              : int  1 0 0 0 0 1 0 0 0 1 ...
##  $ DeafOrHardOfHearing      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BlindOrVisionDifficulty  : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ DifficultyConcentrating  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DifficultyWalking        : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ DifficultyDressingBathing: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DifficultyErrands        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SmokerStatus             : int  0 0 0 1 0 0 1 0 1 0 ...
##  $ ECigaretteUsage          : int  0 1 1 1 1 1 1 1 0 1 ...
##  $ ChestScan                : int  0 0 0 1 1 0 0 1 0 0 ...
##  $ RaceEthnicityCategory    : Factor w/ 5 levels "Black only, Non-Hispanic",..: 5 5 5 5 5 5 1 5 5 5 ...
##  $ AgeCategory              : int  13 13 8 10 5 13 13 13 12 11 ...
##  $ HeightInMeters           : num  1.68 1.6 1.57 1.65 1.57 1.8 1.65 1.63 1.7 1.68 ...
##  $ WeightInKilograms        : num  90.7 68 63.5 63.5 54 ...
##  $ BMI                      : num  26.6 26.6 25.6 23.3 21.8 ...
##  $ AlcoholDrinkers          : int  0 0 0 0 1 0 1 0 0 1 ...
##  $ HIVTesting               : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ FluVaxLast12             : int  1 0 0 1 0 0 0 1 0 1 ...
##  $ PneumoVaxEver            : int  0 0 0 1 1 1 0 1 0 1 ...
##  $ TetanusLast10Tdap        : int  1 1 0 1 1 1 1 1 1 1 ...
##  $ HighRiskLastYear         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CovidPos                 : int  0 0 1 0 0 0 0 0 0 0 ...
summary(df)
##      State           Gender       GeneralHealth   PhysicalHealthDays
##  Min.   : 1.00   Min.   :0.0000   Min.   :1.000   Min.   : 0.000    
##  1st Qu.:16.00   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.: 0.000    
##  Median :27.00   Median :0.0000   Median :4.000   Median : 0.000    
##  Mean   :28.41   Mean   :0.4719   Mean   :3.467   Mean   : 4.068    
##  3rd Qu.:43.00   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.: 3.000    
##  Max.   :54.00   Max.   :1.0000   Max.   :5.000   Max.   :30.000    
##  MentalHealthDays LastCheckupTime PhysicalActivities   SleepHours    
##  Min.   : 0.000   Min.   :1.000   Min.   :0.0000     Min.   : 1.000  
##  1st Qu.: 0.000   1st Qu.:5.000   1st Qu.:1.0000     1st Qu.: 6.000  
##  Median : 0.000   Median :5.000   Median :1.0000     Median : 7.000  
##  Mean   : 4.154   Mean   :4.552   Mean   :0.7699     Mean   : 7.034  
##  3rd Qu.: 4.000   3rd Qu.:5.000   3rd Qu.:1.0000     3rd Qu.: 8.000  
##  Max.   :30.000   Max.   :5.000   Max.   :1.0000     Max.   :24.000  
##   RemovedTeeth    HadHeartAttack     HadAngina         HadStroke      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.4707   Mean   :0.0558   Mean   :0.05887   Mean   :0.04267  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##    HadAsthma      HadSkinCancer        HadCOPD        HadDepressiveDisorder
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000       
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000       
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.0000       
##  Mean   :0.1443   Mean   :0.08153   Mean   :0.07701   Mean   :0.1992       
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000       
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000       
##  HadKidneyDisease   HadArthritis     HadDiabetes     DeafOrHardOfHearing
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000    
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000    
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.00000    
##  Mean   :0.04436   Mean   :0.3335   Mean   :0.1389   Mean   :0.08687    
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000    
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000    
##  BlindOrVisionDifficulty DifficultyConcentrating DifficultyWalking
##  Min.   :0.00000         Min.   :0.0000          Min.   :0.0000   
##  1st Qu.:0.00000         1st Qu.:0.0000          1st Qu.:0.0000   
##  Median :0.00000         Median :0.0000          Median :0.0000   
##  Mean   :0.05136         Mean   :0.1084          Mean   :0.1425   
##  3rd Qu.:0.00000         3rd Qu.:0.0000          3rd Qu.:0.0000   
##  Max.   :1.00000         Max.   :1.0000          Max.   :1.0000   
##  DifficultyDressingBathing DifficultyErrands  SmokerStatus    ECigaretteUsage 
##  Min.   :0.00000           Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000           1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.00000           Median :0.00000   Median :0.0000   Median :1.0000  
##  Mean   :0.03392           Mean   :0.06807   Mean   :0.3667   Mean   :0.7528  
##  3rd Qu.:0.00000           3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.00000           Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##    ChestScan                       RaceEthnicityCategory  AgeCategory    
##  Min.   :0.000   Black only, Non-Hispanic     : 32934    Min.   : 1.000  
##  1st Qu.:0.000   Hispanic                     : 40998    1st Qu.: 5.000  
##  Median :0.000   Multiracial, Non-Hispanic    :  9024    Median : 8.000  
##  Mean   :0.369   Other race only, Non-Hispanic: 22004    Mean   : 7.689  
##  3rd Qu.:1.000   White only, Non-Hispanic     :307518    3rd Qu.:11.000  
##  Max.   :1.000                                           Max.   :13.000  
##  HeightInMeters  WeightInKilograms      BMI        AlcoholDrinkers 
##  Min.   :0.910   Min.   : 22.68    Min.   :14.51   Min.   :0.0000  
##  1st Qu.:1.630   1st Qu.: 68.04    1st Qu.:24.37   1st Qu.:0.0000  
##  Median :1.680   Median : 81.65    Median :26.63   Median :1.0000  
##  Mean   :1.702   Mean   : 81.68    Mean   :27.54   Mean   :0.5841  
##  3rd Qu.:1.780   3rd Qu.: 90.72    3rd Qu.:30.41   3rd Qu.:1.0000  
##  Max.   :2.410   Max.   :285.00    Max.   :40.91   Max.   :1.0000  
##    HIVTesting      FluVaxLast12    PneumoVaxEver    TetanusLast10Tdap
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000   
##  Median :0.0000   Median :1.0000   Median :0.0000   Median :1.0000   
##  Mean   :0.2871   Mean   :0.5785   Mean   :0.3433   Mean   :0.8158   
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   
##  HighRiskLastYear     CovidPos     
##  Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000  
##  Mean   :0.03842   Mean   :0.2774  
##  3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000

Since we will be doing some clustering, it’s better to normalize all the numeric variables.

normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

factor_i <- which(colnames(df) == 'RaceEthnicityCategory')
df_nor <- as.data.frame(lapply(df[-c(factor_i)], normalize))
df_nor <-cbind(df_nor, df[,factor_i])
colnames(df_nor)[ncol(df_nor)] <- 'RaceEthnicityCategory'
summary(df_nor)
##      State            Gender       GeneralHealth    PhysicalHealthDays
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000    
##  1st Qu.:0.2830   1st Qu.:0.0000   1st Qu.:0.5000   1st Qu.:0.0000    
##  Median :0.4906   Median :0.0000   Median :0.7500   Median :0.0000    
##  Mean   :0.5172   Mean   :0.4719   Mean   :0.6168   Mean   :0.1356    
##  3rd Qu.:0.7925   3rd Qu.:1.0000   3rd Qu.:0.7500   3rd Qu.:0.1000    
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000    
##  MentalHealthDays LastCheckupTime  PhysicalActivities   SleepHours    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000     Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:1.0000     1st Qu.:0.2174  
##  Median :0.0000   Median :1.0000   Median :1.0000     Median :0.2609  
##  Mean   :0.1385   Mean   :0.8881   Mean   :0.7699     Mean   :0.2624  
##  3rd Qu.:0.1333   3rd Qu.:1.0000   3rd Qu.:1.0000     3rd Qu.:0.3043  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000     Max.   :1.0000  
##   RemovedTeeth    HadHeartAttack     HadAngina         HadStroke      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.4707   Mean   :0.0558   Mean   :0.05887   Mean   :0.04267  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##    HadAsthma      HadSkinCancer        HadCOPD        HadDepressiveDisorder
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000       
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000       
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.0000       
##  Mean   :0.1443   Mean   :0.08153   Mean   :0.07701   Mean   :0.1992       
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000       
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000       
##  HadKidneyDisease   HadArthritis     HadDiabetes     DeafOrHardOfHearing
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000    
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000    
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.00000    
##  Mean   :0.04436   Mean   :0.3335   Mean   :0.1389   Mean   :0.08687    
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000    
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000    
##  BlindOrVisionDifficulty DifficultyConcentrating DifficultyWalking
##  Min.   :0.00000         Min.   :0.0000          Min.   :0.0000   
##  1st Qu.:0.00000         1st Qu.:0.0000          1st Qu.:0.0000   
##  Median :0.00000         Median :0.0000          Median :0.0000   
##  Mean   :0.05136         Mean   :0.1084          Mean   :0.1425   
##  3rd Qu.:0.00000         3rd Qu.:0.0000          3rd Qu.:0.0000   
##  Max.   :1.00000         Max.   :1.0000          Max.   :1.0000   
##  DifficultyDressingBathing DifficultyErrands  SmokerStatus    ECigaretteUsage 
##  Min.   :0.00000           Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000           1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.00000           Median :0.00000   Median :0.0000   Median :1.0000  
##  Mean   :0.03392           Mean   :0.06807   Mean   :0.3667   Mean   :0.7528  
##  3rd Qu.:0.00000           3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.00000           Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##    ChestScan      AgeCategory     HeightInMeters   WeightInKilograms
##  Min.   :0.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.000   1st Qu.:0.3333   1st Qu.:0.4800   1st Qu.:0.1729   
##  Median :0.000   Median :0.5833   Median :0.5133   Median :0.2248   
##  Mean   :0.369   Mean   :0.5574   Mean   :0.5281   Mean   :0.2249   
##  3rd Qu.:1.000   3rd Qu.:0.8333   3rd Qu.:0.5800   3rd Qu.:0.2594   
##  Max.   :1.000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   
##       BMI         AlcoholDrinkers    HIVTesting      FluVaxLast12   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3735   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.4591   Median :1.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.4935   Mean   :0.5841   Mean   :0.2871   Mean   :0.5785  
##  3rd Qu.:0.6023   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  PneumoVaxEver    TetanusLast10Tdap HighRiskLastYear     CovidPos     
##  Min.   :0.0000   Min.   :0.0000    Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.0000    1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000    Median :0.00000   Median :0.0000  
##  Mean   :0.3433   Mean   :0.8158    Mean   :0.03842   Mean   :0.2774  
##  3rd Qu.:1.0000   3rd Qu.:1.0000    3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000    Max.   :1.00000   Max.   :1.0000  
##                    RaceEthnicityCategory
##  Black only, Non-Hispanic     : 32934   
##  Hispanic                     : 40998   
##  Multiracial, Non-Hispanic    :  9024   
##  Other race only, Non-Hispanic: 22004   
##  White only, Non-Hispanic     :307518   
## 

As clustering is unsupervised machine learning we need to ignore RaceEthnicityCategory. I will start with hclust and as the full dataset is to big I take small samples. But to prove the algorithm I will test every distance and agglomeration method. We have to make a little “trick”, because the new sample dataset is random we need to take a look at how many ethnicities we have beforehand to assign the correct value to k.

set.seed(123)
idx <- sample(1:dim(df_nor)[1], 50)
df_sample <- df_nor[idx,]

clusters <- length(unique(df_sample$RaceEthnicityCategory))

distances <- c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowsky")
methods <- c("complete", "average", "single", "ward.D", "ward.D2", "median", "centroid", "mcquitty")

for (i in 1:length(distances)) {
  for (j in 1:length(methods)) {
    hc <- hclust(dist(df_sample[,1:39], method="euclidean"), method=methods[[j]])
    plot(hc, hang = -1, cex = 0.75, labels=df_sample$RaceEthnicityCategory, main =
           paste("Clusters:", clusters, "Distance:", distances[[i]],"Method:",methods[[j]]))
    rect.hclust(hc, k=clusters, border = 2:(clusters+1))
  }
}

We can clearly see that there is no combination that groups correctly by ethnicity. This can be because the random sample we took, so lets try some others.

for (x in 1:10) {
  set.seed(x)
  idx <- sample(1:dim(df_nor)[1], 50)
  df_sample <- df_nor[idx,]
  
  clusters <- length(unique(df_sample$RaceEthnicityCategory))
  
  distances <- c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowsky")
  methods <- c("complete", "average", "single", "ward.D", "ward.D2", "median", "centroid", "mcquitty")
  
  for (i in 1:length(distances)) {
    for (j in 1:length(methods)) {
      hc <- hclust(dist(df_sample[,1:39], method="euclidean"), method=methods[[j]])
      plot(hc, hang = -1, cex = 0.75, labels=df_sample$RaceEthnicityCategory, main =
             paste("Seed:", x, "Clusters:", clusters, "Distance:", distances[[i]],"Method:",methods[[j]]))
      rect.hclust(hc, k=clusters, border = 2:(clusters+1))
    }
  }
}

Once again there is no model that groups the data correctly. Before trying another approach I will try to reduce the variables just to the diseases.

set.seed(123)
idx <- sample(1:dim(df_nor)[1], 50)
df_sample <- df_nor[idx,]

clusters <- length(unique(df_sample$RaceEthnicityCategory))

distances <- c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowsky")
methods <- c("complete", "average", "single", "ward.D", "ward.D2", "median", "centroid", "mcquitty")

for (i in 1:length(distances)) {
  for (j in 1:length(methods)) {
    hc <- hclust(dist(df_sample[,10:19], method="euclidean"), method=methods[[j]])
    plot(hc, hang = -1, cex = 0.75, labels=df_sample$RaceEthnicityCategory, main =
           paste("Clusters:", clusters, "Distance:", distances[[i]],"Method:",methods[[j]]))
    rect.hclust(hc, k=clusters, border = 2:(clusters+1))
  }
}

Now I will try to reduce the columns only taking into account one disease. As we see previously in the logistic regressions “HadSkinCancer” is the best candidate.

for (i in 1:ncol(df_nor)) {
  print(paste(i,colnames(df_nor)[i]))
}
## [1] "1 State"
## [1] "2 Gender"
## [1] "3 GeneralHealth"
## [1] "4 PhysicalHealthDays"
## [1] "5 MentalHealthDays"
## [1] "6 LastCheckupTime"
## [1] "7 PhysicalActivities"
## [1] "8 SleepHours"
## [1] "9 RemovedTeeth"
## [1] "10 HadHeartAttack"
## [1] "11 HadAngina"
## [1] "12 HadStroke"
## [1] "13 HadAsthma"
## [1] "14 HadSkinCancer"
## [1] "15 HadCOPD"
## [1] "16 HadDepressiveDisorder"
## [1] "17 HadKidneyDisease"
## [1] "18 HadArthritis"
## [1] "19 HadDiabetes"
## [1] "20 DeafOrHardOfHearing"
## [1] "21 BlindOrVisionDifficulty"
## [1] "22 DifficultyConcentrating"
## [1] "23 DifficultyWalking"
## [1] "24 DifficultyDressingBathing"
## [1] "25 DifficultyErrands"
## [1] "26 SmokerStatus"
## [1] "27 ECigaretteUsage"
## [1] "28 ChestScan"
## [1] "29 AgeCategory"
## [1] "30 HeightInMeters"
## [1] "31 WeightInKilograms"
## [1] "32 BMI"
## [1] "33 AlcoholDrinkers"
## [1] "34 HIVTesting"
## [1] "35 FluVaxLast12"
## [1] "36 PneumoVaxEver"
## [1] "37 TetanusLast10Tdap"
## [1] "38 HighRiskLastYear"
## [1] "39 CovidPos"
## [1] "40 RaceEthnicityCategory"
set.seed(123)
idx <- sample(1:dim(df_nor)[1], 50)
df_sample <- df_nor[idx,]

clusters <- length(unique(df_sample$RaceEthnicityCategory))

distances <- c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowsky")
methods <- c("complete", "average", "single", "ward.D", "ward.D2", "median", "centroid", "mcquitty")

for (i in 1:length(distances)) {
  for (j in 1:length(methods)) {
    hc <- hclust(dist(df_sample[,-c(10:13,15:19,40)], method="euclidean"), method=methods[[j]])
    plot(hc, hang = -1, cex = 0.75, labels=df_sample$RaceEthnicityCategory, main =
           paste("Only \"HadSkinCancer\" Clusters:", clusters, "Distance:", distances[[i]],"Method:",methods[[j]]))
    rect.hclust(hc, k=clusters, border = 2:(clusters+1))
  }
}

After all of this we can clearly conclude hclust is not capable to separate our data by ethnicities.

As this file is getting a bit too long I will save the normalized dataset and continue with other clustering algorithms in a new file.

write.csv(df_nor, "FPdata_NOR.csv", row.names = FALSE)